In [1]:
% matplotlib nbagg
import numpy as np
from sklearn import datasets
import random
random.seed(3222)
np.random.seed(3222)


/usr/local/lib/python2.7/dist-packages/IPython/kernel/__init__.py:13: ShimWarning: The `IPython.kernel` package has been deprecated. You should import from ipykernel or jupyter_client instead.
  "You should import from ipykernel or jupyter_client instead.", ShimWarning)

Load the movie-review data and split it into training and test sets (50/50)


In [2]:
# Load the movie-review corpus from disk; load_files labels each document by
# its subfolder name (presumably pos/ and neg/ here, per the target_names
# used below — confirm against the txt_sentoken directory layout).
movies = datasets.load_files("txt_sentoken")

In [3]:
# NOTE(review): sklearn.cross_validation was deprecated in scikit-learn 0.18;
# the modern home of train_test_split is sklearn.model_selection.
from sklearn.cross_validation import train_test_split

# Pass random_state explicitly so the 50/50 split is reproducible on its own
# and does not silently depend on the global numpy seed set in the first cell.
X_train, X_test, y_train, y_test = train_test_split(
    movies.data, movies.target, test_size=0.5, random_state=3222)

Trying K-Nearest Neighbors Classifier


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.neighbors import KNeighborsClassifier

# Bag-of-words counts -> tf-idf weighting -> k-nearest-neighbors classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', KNeighborsClassifier()),
])

In [5]:
# Fit on the training half, then report accuracy on the held-out half.
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()


Out[5]:
0.54200000000000004

In [6]:
# NOTE(review): sklearn.grid_search was deprecated in scikit-learn 0.18;
# GridSearchCV now lives in sklearn.model_selection.
from sklearn.grid_search import GridSearchCV

# Search vectorizer n-gram range, idf weighting, and neighbor count.
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__n_neighbors': (1, 2, 3, 4, 5),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf = gs_clf.fit(X_train, y_train)

# Read the winner straight off the fitted search object instead of manually
# scanning the deprecated grid_scores_ list — same result, and consistent
# with how the SGD grid-search cell further down does it.
best_parameters = gs_clf.best_params_
score = gs_clf.best_score_
for param_name in sorted(parameters.keys()):
    print("%s: %r" % (param_name, best_parameters[param_name]))

score


clf__n_neighbors: 2
tfidf__use_idf: True
vect__ngram_range: (1, 1)
Out[6]:
0.61499999999999999

In [7]:
# Rebuild the pipeline with the grid-search winners baked in, then
# score it on the test half.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', KNeighborsClassifier(n_neighbors=2)),
])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()


Out[7]:
0.63200000000000001

In [8]:
# Sanity check: a clearly positive snippet should map to the 'pos' label.
pred_label = text_clf.predict(["life is good"])[0]
movies.target_names[pred_label]


Out[8]:
'pos'

In [9]:
# Sanity check: a clearly negative snippet should map to the 'neg' label.
pred_label = text_clf.predict(["this sucks"])[0]
movies.target_names[pred_label]


Out[9]:
'neg'

Trying Naive Bayes


In [10]:
from sklearn.naive_bayes import MultinomialNB

# Same counts -> tf-idf front end, with a multinomial naive Bayes classifier.
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

In [11]:
# Fit the naive Bayes pipeline and report held-out accuracy.
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()


Out[11]:
0.80300000000000005

Trying Stochastic Gradient Descent


In [24]:
from sklearn.linear_model import SGDClassifier

# Linear model trained with stochastic gradient descent
# (hinge loss + L2 penalty, i.e. a linear SVM).
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(loss='hinge', penalty='l2',
                          alpha=1e-3, n_iter=10, random_state=42)),
])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()


Out[24]:
0.82299999999999995

In [27]:
# NOTE(review): 'squared_loss' and 'epsilon_insensitive' are regression
# losses; for SGDClassifier consider restricting the search to
# classification losses ('hinge', 'log', 'modified_huber').
parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': (True, False),
              'clf__alpha': (1e-3, 1e-4),
              'clf__loss': ('hinge', 'squared_loss', 'epsilon_insensitive', 'log'),
}
gs_clf = GridSearchCV(text_clf, parameters, n_jobs=-1)
gs_clf.fit(X_train, y_train)

# Only the LAST expression of a cell is displayed, so print the winning
# parameters explicitly (the original left a discarded bare expression and
# an unused `predicted` variable here).
print(gs_clf.best_params_)
gs_clf.score(X_test, y_test)


Out[27]:
0.82499999999999996

Trying SVM linear kernel


In [22]:
from sklearn.svm import SVC

# Exact linear-kernel SVM for comparison with the SGD approximation above,
# reusing the tuned vectorizer settings.
text_clf = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', SVC(kernel='linear')),
])
text_clf = text_clf.fit(X_train, y_train)
predicted = text_clf.predict(X_test)
(predicted == y_test).mean()


Out[22]:
0.82699999999999996

In [ ]: